import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor
Data=pd.read_csv("/content/drive/MyDrive/Datasets/AnticipateBuildingConsumptionNeeds/2016_Building_Energy_Benchmarking.csv")
Data.shape
(3376, 46)
print(Data.isnull().sum().sum()," mssing number out of ",Data.isnull().sum().sum()+Data.notna().sum().sum())
19952 mssing number out of 155296
print(np.round((Data.isnull().sum().sum()*100) /((Data.isnull().sum().sum()+Data.notna().sum().sum()))), "% of missing data")
13.0 % of missing data
Data.head(3)
OSEBuildingID | DataYear | BuildingType | PrimaryPropertyType | PropertyName | Address | City | State | ZipCode | TaxParcelIdentificationNumber | ... | Electricity(kWh) | Electricity(kBtu) | NaturalGas(therms) | NaturalGas(kBtu) | DefaultData | Comments | ComplianceStatus | Outlier | TotalGHGEmissions | GHGEmissionsIntensity | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2016 | NonResidential | Hotel | Mayflower park hotel | 405 Olive way | Seattle | WA | 98101.0 | 0659000030 | ... | 1.156514e+06 | 3946027.0 | 12764.52930 | 1276453.0 | False | NaN | Compliant | NaN | 249.98 | 2.83 |
1 | 2 | 2016 | NonResidential | Hotel | Paramount Hotel | 724 Pine street | Seattle | WA | 98101.0 | 0659000220 | ... | 9.504252e+05 | 3242851.0 | 51450.81641 | 5145082.0 | False | NaN | Compliant | NaN | 295.86 | 2.86 |
2 | 3 | 2016 | NonResidential | Hotel | 5673-The Westin Seattle | 1900 5th Avenue | Seattle | WA | 98101.0 | 0659000475 | ... | 1.451544e+07 | 49526664.0 | 14938.00000 | 1493800.0 | False | NaN | Compliant | NaN | 2089.28 | 2.19 |
3 rows × 46 columns
Data.describe(exclude="object")
OSEBuildingID | DataYear | ZipCode | CouncilDistrictCode | Latitude | Longitude | YearBuilt | NumberofBuildings | NumberofFloors | PropertyGFATotal | ... | SiteEnergyUseWN(kBtu) | SteamUse(kBtu) | Electricity(kWh) | Electricity(kBtu) | NaturalGas(therms) | NaturalGas(kBtu) | DefaultData | Comments | TotalGHGEmissions | GHGEmissionsIntensity | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3376.000000 | 3376.0 | 3360.000000 | 3376.000000 | 3376.000000 | 3376.000000 | 3376.000000 | 3368.000000 | 3376.000000 | 3.376000e+03 | ... | 3.370000e+03 | 3.367000e+03 | 3.367000e+03 | 3.367000e+03 | 3.367000e+03 | 3.367000e+03 | 3376 | 0.0 | 3367.000000 | 3367.000000 |
unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 2 | NaN | NaN | NaN |
top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | False | NaN | NaN | NaN |
freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 3263 | NaN | NaN | NaN |
mean | 21208.991114 | 2016.0 | 98116.949107 | 4.439277 | 47.624033 | -122.334795 | 1968.573164 | 1.106888 | 4.709123 | 9.483354e+04 | ... | 5.276726e+06 | 2.745959e+05 | 1.086639e+06 | 3.707612e+06 | 1.368505e+04 | 1.368505e+06 | NaN | NaN | 119.723971 | 1.175916 |
std | 12223.757015 | 0.0 | 18.615205 | 2.120625 | 0.047758 | 0.027203 | 33.088156 | 2.108402 | 5.494465 | 2.188376e+05 | ... | 1.593879e+07 | 3.912173e+06 | 4.352478e+06 | 1.485066e+07 | 6.709781e+04 | 6.709781e+06 | NaN | NaN | 538.832227 | 1.821452 |
min | 1.000000 | 2016.0 | 98006.000000 | 1.000000 | 47.499170 | -122.414250 | 1900.000000 | 0.000000 | 0.000000 | 1.128500e+04 | ... | 0.000000e+00 | 0.000000e+00 | -3.382680e+04 | -1.154170e+05 | 0.000000e+00 | 0.000000e+00 | NaN | NaN | -0.800000 | -0.020000 |
25% | 19990.750000 | 2016.0 | 98105.000000 | 3.000000 | 47.599860 | -122.350662 | 1948.000000 | 1.000000 | 2.000000 | 2.848700e+04 | ... | 9.701822e+05 | 0.000000e+00 | 1.874229e+05 | 6.394870e+05 | 0.000000e+00 | 0.000000e+00 | NaN | NaN | 9.495000 | 0.210000 |
50% | 23112.000000 | 2016.0 | 98115.000000 | 4.000000 | 47.618675 | -122.332495 | 1975.000000 | 1.000000 | 4.000000 | 4.417500e+04 | ... | 1.904452e+06 | 0.000000e+00 | 3.451299e+05 | 1.177583e+06 | 3.237538e+03 | 3.237540e+05 | NaN | NaN | 33.920000 | 0.610000 |
75% | 25994.250000 | 2016.0 | 98122.000000 | 7.000000 | 47.657115 | -122.319407 | 1997.000000 | 1.000000 | 5.000000 | 9.099200e+04 | ... | 4.381429e+06 | 0.000000e+00 | 8.293178e+05 | 2.829632e+06 | 1.189033e+04 | 1.189034e+06 | NaN | NaN | 93.940000 | 1.370000 |
max | 50226.000000 | 2016.0 | 98272.000000 | 7.000000 | 47.733870 | -122.220966 | 2015.000000 | 111.000000 | 99.000000 | 9.320156e+06 | ... | 4.716139e+08 | 1.349435e+08 | 1.925775e+08 | 6.570744e+08 | 2.979090e+06 | 2.979090e+08 | NaN | NaN | 16870.980000 | 34.090000 |
11 rows × 31 columns
print(Data.describe(exclude="object").columns.tolist())
print(len(Data.describe(exclude="object").columns.tolist()))
['OSEBuildingID', 'DataYear', 'ZipCode', 'CouncilDistrictCode', 'Latitude', 'Longitude', 'YearBuilt', 'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)', 'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseTypeGFA', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)', 'DefaultData', 'Comments', 'TotalGHGEmissions', 'GHGEmissionsIntensity'] 31
print(Data.drop(Data.describe(exclude="object").columns.tolist(),axis=1).columns.tolist())
print(len(Data.drop(Data.describe(exclude="object").columns.tolist(),axis=1).columns.tolist()))
['BuildingType', 'PrimaryPropertyType', 'PropertyName', 'Address', 'City', 'State', 'TaxParcelIdentificationNumber', 'Neighborhood', 'ListOfAllPropertyUseTypes', 'LargestPropertyUseType', 'SecondLargestPropertyUseType', 'ThirdLargestPropertyUseType', 'YearsENERGYSTARCertified', 'ComplianceStatus', 'Outlier'] 15
Data['BuildingType'].unique()
array(['NonResidential', 'Nonresidential COS', 'Multifamily MR (5-9)', 'SPS-District K-12', 'Campus', 'Multifamily LR (1-4)', 'Multifamily HR (10+)', 'Nonresidential WA'], dtype=object)
print("NonResidential:", len(Data[Data['BuildingType'].isin(["NonResidential"])]))
NonResidential: 1460
print("Nonresidential COS:", len(Data[Data['BuildingType'].isin(["Nonresidential COS"])]))
Nonresidential COS: 85
print("Nonresidential WA:", len(Data[Data['BuildingType'].isin(["Nonresidential WA"])]))
Nonresidential WA: 1
DataNR=Data[Data['BuildingType'].isin(["NonResidential","Nonresidential COS","Nonresidential WA"])].copy()
print(DataNR.describe(exclude="object").columns.tolist())
['OSEBuildingID', 'DataYear', 'ZipCode', 'CouncilDistrictCode', 'Latitude', 'Longitude', 'YearBuilt', 'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)', 'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseTypeGFA', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 'SiteEnergyUse(kBtu)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)', 'DefaultData', 'Comments', 'TotalGHGEmissions', 'GHGEmissionsIntensity']
SiteEnergyUse(kBtu):The annual amount of energy consumed by the property from all sources of energy.
TotalGHGEmissions:The total amount of greenhouse gas emissions, including carbon dioxide, methane, and nitrous oxide gases released into the atmosphere as a result of energy consumption at the property
DataNR['PrimaryPropertyType'].unique()
array(['Hotel', 'Other', 'Mixed Use Property', 'University', 'Small- and Mid-Sized Office', 'Self-Storage Facility', 'Warehouse', 'K-12 School', 'Large Office', 'Senior Care Community', 'Medical Office', 'Retail Store', 'Hospital', 'Residence Hall', 'Distribution Center', 'Worship Facility', 'Supermarket / Grocery Store', 'Laboratory', 'Refrigerated Warehouse', 'Restaurant', 'Low-Rise Multifamily', 'Office'], dtype=object)
DataNR['NumberofBuildings'].unique()
array([ 1., 3., 0., 2., 4., 6., 9., 5., nan, 7., 8.])
DataNR['PropertyGFATotal'].unique()
array([ 88434, 103566, 956110, ..., 13157, 14101, 18258])
DataNR['Neighborhood'].unique()
array(['DOWNTOWN', 'NORTHEAST', 'EAST', 'LAKE UNION', 'GREATER DUWAMISH', 'BALLARD', 'NORTHWEST', 'MAGNOLIA / QUEEN ANNE', 'CENTRAL', 'SOUTHWEST', 'SOUTHEAST', 'NORTH', 'DELRIDGE', 'North', 'Delridge', 'Ballard', 'Northwest', 'Central', 'DELRIDGE NEIGHBORHOODS'], dtype=object)
DataNRCO2E=DataNR[['OSEBuildingID','PrimaryPropertyType','Latitude', 'Longitude','Neighborhood', 'YearBuilt','NumberofBuildings','NumberofFloors','TotalGHGEmissions','SiteEnergyUse(kBtu)','ENERGYSTARScore']].copy()
DataNRCO2E['BuildingAge']=2016-DataNRCO2E['YearBuilt']
DataNRCO2E
OSEBuildingID | PrimaryPropertyType | Latitude | Longitude | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | TotalGHGEmissions | SiteEnergyUse(kBtu) | ENERGYSTARScore | BuildingAge | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Hotel | 47.61220 | -122.33799 | DOWNTOWN | 1927 | 1.0 | 12 | 249.98 | 7.226362e+06 | 60.0 | 89 |
1 | 2 | Hotel | 47.61317 | -122.33393 | DOWNTOWN | 1996 | 1.0 | 11 | 295.86 | 8.387933e+06 | 61.0 | 20 |
2 | 3 | Hotel | 47.61393 | -122.33810 | DOWNTOWN | 1969 | 1.0 | 41 | 2089.28 | 7.258702e+07 | 43.0 | 47 |
3 | 5 | Hotel | 47.61412 | -122.33664 | DOWNTOWN | 1926 | 1.0 | 10 | 286.43 | 6.794584e+06 | 56.0 | 90 |
4 | 8 | Hotel | 47.61375 | -122.34047 | DOWNTOWN | 1980 | 1.0 | 18 | 505.01 | 1.417261e+07 | 75.0 | 36 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3371 | 50222 | Office | 47.56722 | -122.31154 | GREATER DUWAMISH | 1990 | 1.0 | 1 | 20.94 | 8.497457e+05 | 46.0 | 26 |
3372 | 50223 | Other | 47.59625 | -122.32283 | DOWNTOWN | 2004 | 1.0 | 1 | 32.17 | 9.502762e+05 | NaN | 12 |
3373 | 50224 | Other | 47.63644 | -122.35784 | MAGNOLIA / QUEEN ANNE | 1974 | 1.0 | 1 | 223.54 | 5.765898e+06 | NaN | 42 |
3374 | 50225 | Mixed Use Property | 47.52832 | -122.32431 | GREATER DUWAMISH | 1989 | 1.0 | 1 | 22.11 | 7.194712e+05 | NaN | 27 |
3375 | 50226 | Mixed Use Property | 47.53939 | -122.29536 | GREATER DUWAMISH | 1938 | 1.0 | 1 | 41.27 | 1.152896e+06 | NaN | 78 |
1546 rows × 12 columns
print(DataNRCO2E.isnull().sum().sum())
DataNRCO2E.dropna(inplace=True)
546
DataNRCO2E
OSEBuildingID | PrimaryPropertyType | Latitude | Longitude | Neighborhood | YearBuilt | NumberofBuildings | NumberofFloors | TotalGHGEmissions | SiteEnergyUse(kBtu) | ENERGYSTARScore | BuildingAge | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Hotel | 47.61220 | -122.33799 | DOWNTOWN | 1927 | 1.0 | 12 | 249.98 | 7.226362e+06 | 60.0 | 89 |
1 | 2 | Hotel | 47.61317 | -122.33393 | DOWNTOWN | 1996 | 1.0 | 11 | 295.86 | 8.387933e+06 | 61.0 | 20 |
2 | 3 | Hotel | 47.61393 | -122.33810 | DOWNTOWN | 1969 | 1.0 | 41 | 2089.28 | 7.258702e+07 | 43.0 | 47 |
3 | 5 | Hotel | 47.61412 | -122.33664 | DOWNTOWN | 1926 | 1.0 | 10 | 286.43 | 6.794584e+06 | 56.0 | 90 |
4 | 8 | Hotel | 47.61375 | -122.34047 | DOWNTOWN | 1980 | 1.0 | 18 | 505.01 | 1.417261e+07 | 75.0 | 36 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3339 | 50069 | Small- and Mid-Sized Office | 47.53161 | -122.29944 | GREATER DUWAMISH | 1929 | 1.0 | 2 | 134.80 | 4.420650e+06 | 9.0 | 87 |
3347 | 50081 | K-12 School | 47.58831 | -122.30650 | GREATER DUWAMISH | 2015 | 1.0 | 3 | 9.24 | 1.325973e+06 | 77.0 | 1 |
3366 | 50210 | Office | 47.63572 | -122.37525 | MAGNOLIA / QUEEN ANNE | 1952 | 1.0 | 1 | 3.50 | 5.026677e+05 | 75.0 | 64 |
3369 | 50220 | Office | 47.56440 | -122.27813 | SOUTHEAST | 1960 | 1.0 | 1 | 7.79 | 3.878100e+05 | 93.0 | 56 |
3371 | 50222 | Office | 47.56722 | -122.31154 | GREATER DUWAMISH | 1990 | 1.0 | 1 | 20.94 | 8.497457e+05 | 46.0 | 26 |
1006 rows × 12 columns
We have categorical features we need to transform them with One Hot Encoder
DataNRCO2E.describe(include="object")
PrimaryPropertyType | Neighborhood | |
---|---|---|
count | 1006 | 1006 |
unique | 18 | 18 |
top | Small- and Mid-Sized Office | DOWNTOWN |
freq | 238 | 249 |
DataNRCO2E.drop(DataNRCO2E[DataNRCO2E["PrimaryPropertyType"].isin(['Low-Rise Multifamily','Residence Hall'])].index,axis=0,inplace=True)
DataNRCO2E.skew()
<ipython-input-27-3b9fa3907873>:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction. DataNRCO2E.skew()
OSEBuildingID 0.345459 Latitude 0.247815 Longitude 0.011083 YearBuilt -0.394848 NumberofBuildings 10.418149 NumberofFloors 4.874361 TotalGHGEmissions 12.471995 SiteEnergyUse(kBtu) 8.861597 ENERGYSTARScore -0.675025 BuildingAge 0.394848 dtype: float64
DataNRCO2E.hist(alpha=0.5, figsize=(20, 10))
plt.tight_layout()
SiteEnergyUse(kBtu) after log transformation
DataNRCO2E['SiteEnergyUse(kBtu)']=np.log1p(DataNRCO2E['SiteEnergyUse(kBtu)'])
sns.distplot(DataNRCO2E['SiteEnergyUse(kBtu)'])
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='SiteEnergyUse(kBtu)', ylabel='Density'>
TotalGHGEmissions after log transformation
DataNRCO2E["TotalGHGEmissions"]=np.log1p(DataNRCO2E["TotalGHGEmissions"])
sns.distplot(DataNRCO2E["TotalGHGEmissions"])
/usr/local/lib/python3.8/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='TotalGHGEmissions', ylabel='Density'>
from numpy import mean, std, absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
X,y=DataNRCO2E.drop(["OSEBuildingID","TotalGHGEmissions","SiteEnergyUse(kBtu)"], axis=1), DataNRCO2E["TotalGHGEmissions"]
print(X.shape, y.shape)
(986, 9) (986,)
# determine categorical and numerical features
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
# define the data preparation for the columns
t = [('cat', OneHotEncoder(), categorical_ix), ('num', StandardScaler(), numerical_ix.values)]
col_transform = ColumnTransformer(transformers=t)
col_transform
ColumnTransformer(transformers=[('cat', OneHotEncoder(), Index(['PrimaryPropertyType', 'Neighborhood'], dtype='object')), ('num', StandardScaler(), array(['Latitude', 'Longitude', 'YearBuilt', 'NumberofBuildings', 'NumberofFloors', 'ENERGYSTARScore', 'BuildingAge'], dtype=object))])
# define the models
modelDummyReg= DummyRegressor(strategy="mean")
modelLR= LinearRegression()
modelSVR = SVR(kernel='rbf',gamma='scale',C=100)
modelXGB_T1 = XGBRegressor()
# define the data preparation and modeling pipeline
pipelineDummyReg = Pipeline(steps=[('prep',col_transform), ('m', modelDummyReg)])
pipelineLR = Pipeline(steps=[('prep',col_transform), ('m', modelLR)])
pipelineSVR = Pipeline(steps=[('prep',col_transform), ('m', modelSVR)])
pipelineXGB = Pipeline(steps=[('prep',col_transform), ('m', modelXGB_T1)])
# define the model cross-validation configuration
cv = KFold(n_splits=10, shuffle=True, random_state=1)
# evaluate the pipeline using cross validation and calculate MAE
scoresDummyReg = cross_val_score(pipelineDummyReg, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresDummyReg = absolute(scoresDummyReg)
# summarize the model performance
print('DummyReg MAE: %.3f (%.3f)' % (mean(scoresDummyReg), std(scoresDummyReg)))
DummyReg MAE: 1.151 (0.091)
# evaluate the pipeline using cross validation and calculate MAE
scoresLR = cross_val_score(pipelineLR, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresLR = absolute(scoresLR)
# summarize the model performance
print('LinearRegression MAE: %.3f (%.3f)' % (mean(scoresLR), std(scoresLR)))
LinearRegression MAE: 0.841 (0.066)
# evaluate the pipeline using cross validation and calculate MAE
scoresSVR = cross_val_score(pipelineSVR, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresSVR = absolute(scoresSVR)
# summarize the model performance
print('SVR MAE: %.3f (%.3f)' % (mean(scoresSVR), std(scoresSVR)))
SVR MAE: 1.005 (0.062)
# evaluate the pipeline using cross validation and calculate MAE
scoresXGB = cross_val_score(pipelineXGB, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresXGB = absolute(scoresXGB)
# summarize the model performance
print('XGB MAE: %.3f (%.3f)' % (mean(scoresXGB), std(scoresXGB)))
XGB MAE: 0.806 (0.088)
# evaluate the pipeline using cross validation and calculate R2
scoresDummyReg = cross_val_score(pipelineDummyReg, X, y, scoring='r2',error_score="raise", cv=cv, n_jobs=-1)
# summarize the model performance
print('DummyReg r2: %.3f (%.3f)' % (mean(scoresDummyReg), std(scoresDummyReg)))
DummyReg r2: -0.010 (0.010)
# evaluate the pipeline using cross validation and calculate R2
scoresSVR = cross_val_score(pipelineSVR, X, y, scoring='r2',error_score="raise", cv=cv, n_jobs=-1)
# summarize the model performance
print('SVR r2: %.3f (%.3f)' % (mean(scoresSVR), std(scoresSVR)))
SVR r2: 0.158 (0.128)
# evaluate the pipeline using cross validation and calculate r2
scoresXGB = cross_val_score(pipelineXGB, X, y, scoring='r2',error_score="raise", cv=cv, n_jobs=-1)
# summarize the model performance
print('XGB r2: %.3f (%.3f)' % (mean(scoresXGB), std(scoresXGB)))
XGB r2: 0.468 (0.082)
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
transformer = make_column_transformer(
(OneHotEncoder(), ['PrimaryPropertyType', 'Neighborhood']),
remainder='passthrough')
transformed = transformer.fit_transform(X).toarray()
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
X_train,X_test,y_train,y_test = train_test_split(transformed_df,y,test_size=0.3,random_state=42)
modelXGB_T1.fit(X_train,y_train)
[09:32:27] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
XGBRegressor()
ypred=modelXGB_T1.predict(X_test)
modelXGB_T1.score(X_test,ypred)
1.0
pd.DataFrame(np.expm1(modelXGB_T1.predict(transformed_df)),columns=['ypred']).join(np.expm1(y.reset_index(drop=True)))
ypred | TotalGHGEmissions | |
---|---|---|
0 | 319.701172 | 249.98 |
1 | 304.534607 | 295.86 |
2 | 990.298706 | 2089.28 |
3 | 319.701172 | 286.43 |
4 | 342.030884 | 505.01 |
... | ... | ... |
981 | 26.405626 | 134.80 |
982 | 24.814886 | 9.24 |
983 | 28.213591 | 3.50 |
984 | 22.537758 | 7.79 |
985 | 30.965725 | 20.94 |
986 rows × 2 columns
pd.DataFrame(data=modelXGB_T1.feature_importances_).nlargest(10,0)
0 | |
---|---|
38 | 0.152405 |
12 | 0.123078 |
18 | 0.082017 |
13 | 0.072722 |
14 | 0.053095 |
39 | 0.043498 |
11 | 0.031702 |
4 | 0.031545 |
8 | 0.031119 |
0 | 0.028707 |
pd.DataFrame(transformed_df.columns).T
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | onehotencoder__PrimaryPropertyType_Distributio... | onehotencoder__PrimaryPropertyType_Hospital | onehotencoder__PrimaryPropertyType_Hotel | onehotencoder__PrimaryPropertyType_K-12 School | onehotencoder__PrimaryPropertyType_Large Office | onehotencoder__PrimaryPropertyType_Medical Office | onehotencoder__PrimaryPropertyType_Mixed Use P... | onehotencoder__PrimaryPropertyType_Office | onehotencoder__PrimaryPropertyType_Other | onehotencoder__PrimaryPropertyType_Refrigerate... | ... | onehotencoder__Neighborhood_Northwest | onehotencoder__Neighborhood_SOUTHEAST | onehotencoder__Neighborhood_SOUTHWEST | remainder__Latitude | remainder__Longitude | remainder__YearBuilt | remainder__NumberofBuildings | remainder__NumberofFloors | remainder__ENERGYSTARScore | remainder__BuildingAge |
1 rows × 41 columns
Energy StarScore is just the 6th in the top ten of feature importance. It is not really usefull.
X,y=DataNRCO2E.drop(["OSEBuildingID","TotalGHGEmissions","SiteEnergyUse(kBtu)"], axis=1), DataNRCO2E["SiteEnergyUse(kBtu)"]
# determine categorical and numerical features
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
# define the data preparation for the columns
t = [('cat', OneHotEncoder(), categorical_ix), ('num', StandardScaler(), numerical_ix.values)]
col_transform = ColumnTransformer(transformers=t)
# define the models
modelDummyReg= DummyRegressor(strategy="mean")
modelLR= LinearRegression()
modelSVR = SVR(kernel='rbf',gamma='scale',C=100)
modelXGB_T2 = XGBRegressor()
# define the data preparation and modeling pipeline
pipelineDummyReg = Pipeline(steps=[('prep',col_transform), ('m', modelDummyReg)])
pipelineLR = Pipeline(steps=[('prep',col_transform), ('m', modelLR)])
pipelineSVR = Pipeline(steps=[('prep',col_transform), ('m', modelSVR)])
pipelineXGB = Pipeline(steps=[('prep',col_transform), ('m', modelXGB_T2)])
# define the model cross-validation configuration
cv = KFold(n_splits=10, shuffle=True, random_state=1)
# evaluate the pipeline using cross validation and calculate MAE
scoresDummyReg = cross_val_score(pipelineDummyReg, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresDummyReg = absolute(scoresDummyReg)
# summarize the model performance
print('DummyReg MAE: %.3f (%.3f)' % (mean(scoresDummyReg), std(scoresDummyReg)))
# evaluate the pipeline using cross validation and calculate MAE
scoresLR = cross_val_score(pipelineLR, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresLR = absolute(scoresLR)
# summarize the model performance
print('LinearRegression MAE: %.3f (%.3f)' % (mean(scoresLR), std(scoresLR)))
# evaluate the pipeline using cross validation and calculate MAE
scoresSVR = cross_val_score(pipelineSVR, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresSVR = absolute(scoresSVR)
# summarize the model performance
print('SVR MAE: %.3f (%.3f)' % (mean(scoresSVR), std(scoresSVR)))
# evaluate the pipeline using cross validation and calculate MAE
scoresXGB = cross_val_score(pipelineXGB, X, y, scoring='neg_mean_absolute_error',error_score="raise", cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scoresXGB = absolute(scoresXGB)
# summarize the model performance
print('XGB MAE: %.3f (%.3f)' % (mean(scoresXGB), std(scoresXGB)))
DummyReg MAE: 1.066 (0.097) LinearRegression MAE: 0.611 (0.038) SVR MAE: 0.712 (0.070) XGB MAE: 0.577 (0.049)
# evaluate the pipeline using cross validation and calculate R2
scoresDummyReg = cross_val_score(pipelineDummyReg, X, y, scoring='r2',error_score="raise", cv=cv, n_jobs=-1)
# summarize the model performance
print('DummyReg r2: %.3f (%.3f)' % (mean(scoresDummyReg), std(scoresDummyReg)))
# evaluate the pipeline using cross validation and calculate R2
scoresSVR = cross_val_score(pipelineSVR, X, y, scoring='r2',error_score="raise", cv=cv, n_jobs=-1)
# summarize the model performance
print('SVR r2: %.3f (%.3f)' % (mean(scoresSVR), std(scoresSVR)))
# evaluate the pipeline using cross validation and calculate r2
scoresXGB = cross_val_score(pipelineXGB, X, y, scoring='r2',error_score="raise", cv=cv, n_jobs=-1)
# summarize the model performance
print('XGB r2: %.3f (%.3f)' % (mean(scoresXGB), std(scoresXGB)))
DummyReg r2: -0.011 (0.012) SVR r2: 0.454 (0.119) XGB r2: 0.648 (0.074)
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
transformer = make_column_transformer(
(OneHotEncoder(), ['PrimaryPropertyType', 'Neighborhood']),
remainder='passthrough')
transformed = transformer.fit_transform(X).toarray()
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
X_train,X_test,y_train,y_test = train_test_split(transformed_df,y,test_size=0.3,random_state=42)
modelXGB_T2.fit(X_train,y_train)
ypred=modelXGB_T2.predict(X_test)
modelXGB_T2.score(X_test,ypred)
[09:32:45] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
1.0
pd.DataFrame(np.expm1(modelXGB_T2.predict(transformed_df)),columns=['ypred']).join(np.expm1(y.reset_index(drop=True)))
ypred | SiteEnergyUse(kBtu) | |
---|---|---|
0 | 9.411165e+06 | 7.226363e+06 |
1 | 1.160892e+07 | 8.387933e+06 |
2 | 3.726919e+07 | 7.258702e+07 |
3 | 9.016754e+06 | 6.794584e+06 |
4 | 1.540815e+07 | 1.417261e+07 |
... | ... | ... |
981 | 1.798050e+06 | 4.420650e+06 |
982 | 2.200798e+06 | 1.325973e+06 |
983 | 1.222920e+06 | 5.026677e+05 |
984 | 7.690450e+05 | 3.878100e+05 |
985 | 1.485874e+06 | 8.497457e+05 |
986 rows × 2 columns